import pandas as pd
import numpy as np
import seaborn as sns
import random
import plotly.offline as py
import plotly.graph_objects as go
import plotly.express as px
#import chart_studio.plotly as py
Introduction
Description and Simulation: In this worksheet, I will create an artificial data set, the age of a population, using a composition technique. In other words, I will create six sample sets of the population, each with a different mean age, to portray each age interval. After that, I will explore the data with descriptive statistics, enabling us to understand the data from different perspectives.
Construct a Data Set
"""np.random.normal o generate a vector of random values that follow a normal distribution
with a specific mean and standard deviation: mean, sd, size """
# Note: because the data is made up by sampling normal distributions, some
# observations come out negative. Age can't be negative, so that needs to be
# handled later.

# random.seed() only seeds Python's built-in generator (used by random.choice
# and random.sample); the NumPy draws below need their own seed to be
# reproducible.
random.seed(112)
np.random.seed(112)

# Six sample sets, each drawn as np.random.normal(mean, sd, size).
a = np.random.normal(loc=5, scale=7, size=2000)
b = np.random.normal(loc=5, scale=7, size=2000)
c = np.random.normal(loc=35, scale=5, size=2500)
d = np.random.normal(loc=50, scale=8, size=3000)
e = np.random.normal(loc=70, scale=5, size=1000)
f = np.random.normal(loc=80, scale=7, size=1000)

# combine all the vectors into one population
pop = np.concatenate((a, b, c, d, e, f))
pop.shape
(11500,)
# Take the absolute value of every observation and cast to int
# (the cast truncates the fractional part).
pop = np.absolute(pop).astype(int)

# exclude age == 0
pop = pop[pop != 0]
pop
array([ 8, 4, 9, ..., 75, 87, 63])
# the number of observations does not decrease significantly, so it should be fine
pop.shape
(11136,)
# One row per person, with a single 'Age' column built from the population array.
df = pd.DataFrame(pop, columns=['Age'])

# The two categories a person's 'Sex' value is drawn from.
sex = ['Male', 'Female']
#random.choice() is a function using to pick a random value from a list
print(random.choice(sex))
Female
df['Sex'] = random.choice(sex)  # the column must exist (filled with some value) before iterating over it
df['Sex'] = [random.choice(sex) for i in df['Sex']]  # draw an independent random choice for every row
df
Age | Sex | |
---|---|---|
0 | 8 | Female |
1 | 4 | Female |
2 | 9 | Male |
3 | 1 | Female |
4 | 6 | Male |
... | ... | ... |
11131 | 67 | Male |
11132 | 73 | Female |
11133 | 75 | Female |
11134 | 87 | Male |
11135 | 63 | Female |
11136 rows × 2 columns
df
Age | Sex | |
---|---|---|
0 | 8 | Female |
1 | 4 | Female |
2 | 9 | Male |
3 | 1 | Female |
4 | 6 | Male |
... | ... | ... |
11131 | 67 | Male |
11132 | 73 | Female |
11133 | 75 | Female |
11134 | 87 | Male |
11135 | 63 | Female |
11136 rows × 2 columns
# Number of rows per sex.
df.groupby(['Sex']).count()
Age | |
---|---|
Sex | |
Female | 5524 |
Male | 5612 |
Explore the data with some visualizations
#Prepare the data
"""
I want to make a pyramid population catagorized by gender aging interval. First step is that
I might need to put each person into different bins depending on their age
"""
'\nI want to make a pyramid population catagorized by gender aging interval. First step is that \nI might need to put each person into different bins depending on their age\n'
# Create the Age_Interval column with 5 bins.
# The first bin includes both ends (1..20); the others are right-inclusive,
# e.g. (20, 40], so each boundary age falls into exactly one bin.
# Ages above 100 would match no bin and stay without an interval.
df.loc[df['Age'].between(1, 20, inclusive='both'), 'Age_Interval'] = '1-20'
df.loc[df['Age'].between(20, 40, inclusive='right'), 'Age_Interval'] = '21-40'
df.loc[df['Age'].between(40, 60, inclusive='right'), 'Age_Interval'] = '41-60'
df.loc[df['Age'].between(60, 80, inclusive='right'), 'Age_Interval'] = '61-80'
df.loc[df['Age'].between(80, 100, inclusive='right'), 'Age_Interval'] = '81-100'
#Credit: https://medium.com/towards-data-science/how-to-bin-numerical-data-with-pandas-fe5146c9dc55
df
Age | Sex | Age_Interval | |
---|---|---|---|
0 | 8 | Female | 1-20 |
1 | 4 | Female | 1-20 |
2 | 9 | Male | 1-20 |
3 | 1 | Female | 1-20 |
4 | 6 | Male | 1-20 |
... | ... | ... | ... |
11131 | 67 | Male | 61-80 |
11132 | 73 | Female | 61-80 |
11133 | 75 | Female | 61-80 |
11134 | 87 | Male | 81-100 |
11135 | 63 | Female | 61-80 |
11136 rows × 3 columns
# Count people per (age interval, sex) pair and name the count column.
df1 = (
    df.groupby(['Age_Interval', 'Sex'])[['Age']]
    .count()
    .reset_index()
    .rename(columns={'Age': 'Number_of_Pop'})
)
# Notice that the first attribute is repetitive: this is long format, so it needs to be converted to wide format.
# For analysis purposes we mostly want wide format, but plotting graphs in R or some analytical tools might require long format.
df1
Age_Interval | Sex | Number_of_Pop | |
---|---|---|---|
0 | 1-20 | Female | 1812 |
1 | 1-20 | Male | 1775 |
2 | 21-40 | Female | 1295 |
3 | 21-40 | Male | 1379 |
4 | 41-60 | Female | 1314 |
5 | 41-60 | Male | 1334 |
6 | 61-80 | Female | 890 |
7 | 61-80 | Male | 886 |
8 | 81-100 | Female | 213 |
9 | 81-100 | Male | 238 |
# The pivot method converts from long to wide: one row per age interval, one column per sex.
df2 = pd.pivot(df1, index='Age_Interval', columns='Sex', values='Number_of_Pop')
#Credit: https://towardsdatascience.com/reshaping-a-pandas-dataframe-long-to-wide-and-vice-versa-517c7f0995ad
df2
Sex | Female | Male |
---|---|---|
Age_Interval | ||
1-20 | 1812 | 1775 |
21-40 | 1295 | 1379 |
41-60 | 1314 | 1334 |
61-80 | 890 | 886 |
81-100 | 213 | 238 |
# Inspect the dtype of the female counts.
df2['Female'].dtype
dtype('int64')
# Negate the female counts so they sit on the negative side of the x-axis.
women_bins = [-count for count in df2['Female']]
len(women_bins)
5
# Both sides of the pyramid as NumPy arrays.
women_bins = np.array(women_bins)
men_bins = np.array(df2['Male'])

df3 = df2.reset_index()  # turn Age_Interval from the index back into a regular column
y = list(df3['Age_Interval'])  # convert to a list and use it as the y-axis categories
# Population pyramid: two horizontal bar traces that share the age-interval
# axis. Men use positive x values and women use negative x values, so the
# bars grow in opposite directions from zero.
layout = go.Layout(
    yaxis=go.layout.YAxis(title='Age'),
    xaxis=go.layout.XAxis(
        range=[-2200, 2200],
        tickvals=[-2000, -1500, -1000, -500, 0, 500, 1000, 1500, 2000],
        # Tick labels drop the sign so both sides read as positive counts.
        ticktext=[2000, 1500, 1000, 500, 0, 500, 1000, 1500, 2000],
        title='Number_of_Population',
    ),
    barmode='overlay',
    bargap=0.1,
)

data = [
    go.Bar(
        y=y,
        x=men_bins,
        orientation='h',
        name='Men',
        hoverinfo='x',
        marker=dict(color='powderblue'),
    ),
    go.Bar(
        y=y,
        x=women_bins,
        orientation='h',
        name='Women',
        # Hover shows the positive count instead of the negated plotting value.
        text=-1 * women_bins.astype('int'),
        hoverinfo='text',
        marker=dict(color='seagreen'),
    ),
]

py.iplot(dict(data=data, layout=layout), filename='EXAMPLES/bar_pyramid')
The pyramid graph above gives an overall sense of the population grouped by age interval and sex. We can see that the largest group of the population is in the range of 1-20 years old.
Measures of Central Tendency and Dispersion of Data
Now I want to explore the central tendency of the age of the population, so I go back to working with the data as it was before categorizing it into different bins. To gain a better understanding, I will first show the calculations behind some descriptive statistics before using library functions to get the same results.
pop  # age of individuals, population
array([ 8, 4, 9, ..., 75, 87, 63])
pop.shape  # number of people in the population
(11136,)
# Mean computed by hand: sum of all ages divided by the number of people,
# rounded to a whole number.
mean_pop = round(sum(pop) / len(pop))
mean_pop  # Average age of population is 37 years old
37
np.median(pop)# Median of pop is 37
37.0
def mode(lst):
    """Return every most-frequent value of *lst* as a list.

    Args:
        lst: Iterable of hashable values (for example a list or a 1-D
            NumPy array).

    Returns:
        A list holding each value whose count equals the highest count,
        in order of first appearance. More than one value is returned when
        several tie; an empty input gives an empty list.
    """
    from collections import Counter

    # Counter maps each value to its frequency and keeps first-seen order.
    freq = Counter(lst)
    if not freq:
        # max() would raise on an empty input; there is simply no mode.
        return []

    highest = max(freq.values())
    return [value for value, count in freq.items() if count == highest]
# calling mode() function and passing list
# as argument
print(mode(pop))
#Credit: https://www.geeksforgeeks.org/how-to-calculate-the-mode-of-numpy-array/
[1]
pop
array([ 8, 4, 9, ..., 75, 87, 63])
# Observe the dispersion of the data through the deviation from the mean:
# subtract the mean from every element. Subtracting a scalar from the NumPy
# array does this for all elements at once, so no explicit loop is needed.
dev = pop - mean_pop
dev
array([-29, -33, -28, ..., 38, 50, 26])
# The mean of the deviations is normally zero; here it is only close to zero
# because mean_pop was rounded to a whole number before subtracting.
np.mean(dev)
-0.20510057471264367
# absolute deviation from the mean
dev = abs(dev)
dev
array([29, 33, 28, ..., 38, 50, 26])
# Mean Absolute Deviation, or 'MAD': the average of the absolute deviations.
mad = np.mean(dev)
print("Mean Absolute deviation is % s "
      % mad)
Mean Absolute deviation is 20.97162356321839
import statistics

# statistics.stdev computes the sample standard deviation; hand it a plain
# Python list rather than the NumPy array.
pop_list = pop.tolist()
print("Standard Deviation of sample is % s "
      % (statistics.stdev(pop_list)))
Standard Deviation of sample is 25.072255798045703
Mean absolute deviation (MAD) is a measure of the average absolute distance between each data value and the mean of a data set. Similar to standard deviation, MAD is a parameter or statistic that measures the spread, or variation, in your data.
Although both MAD and SD measure the spread of the data, SD is usually bigger than MAD because SD is more sensitive to values that are farther away from the mean. For more detail on MAD and SD: https://articles.outlier.org/mean-absolute-deviation-meaning
Describing Dispersion
# Use describe() to see the basic descriptive measurements
df['Age'].describe()
count 11136.000000
mean 36.794899
std 25.072256
min 1.000000
25% 11.000000
50% 37.000000
75% 55.000000
max 100.000000
Name: Age, dtype: float64
# Range: the difference between the largest and the smallest age.
# Stored as age_range so the built-in range() is not shadowed.
age_range = df['Age'].max() - df['Age'].min()
print('Range is %s'
      % age_range)
Range is 99
# Interquartile range = Q3 – Q1
q1, q3 = np.percentile(df.Age, [25, 75])
iqr = q3 - q1
print(iqr)
44.0
Detecting Outlier
The definition of an outlier here is any data point that lies below the lower limit ($Q_1 - 1.5 \times IQR$) or above the upper limit ($Q_3 + 1.5 \times IQR$).
# Find the lower limit and the upper limit
lower_limit = q1 - (1.5 * iqr)
upper_limit = q3 + (1.5 * iqr)
print(lower_limit, upper_limit)
# With the limits printed in the recorded run (-55.0 121.0), only someone
# older than 121 would be considered an outlier.
-55.0 121.0
# Select a sample of 2500 ages (random.sample draws without replacement)
# and look at its statistical measurements.
age_list = df.Age.tolist()
sample = random.sample(age_list, 2500)
sample = pd.DataFrame(sample)
sample.describe()
0 | |
---|---|
count | 2500.000000 |
mean | 36.736800 |
std | 24.606489 |
min | 1.000000 |
25% | 11.000000 |
50% | 37.000000 |
75% | 54.000000 |
max | 94.000000 |
# The same summary for the whole population, to compare with the sample.
df['Age'].describe()
count 11136.000000
mean 36.794899
std 25.072256
min 1.000000
25% 11.000000
50% 37.000000
75% 55.000000
max 100.000000
Name: Age, dtype: float64
# the statistical measurements of the sample set and the population are quite similar,
# so the sample set represents the population well
df
Age | Sex | Age_Interval | |
---|---|---|---|
0 | 8 | Female | 1-20 |
1 | 4 | Female | 1-20 |
2 | 9 | Male | 1-20 |
3 | 1 | Female | 1-20 |
4 | 6 | Male | 1-20 |
... | ... | ... | ... |
11131 | 67 | Male | 61-80 |
11132 | 73 | Female | 61-80 |
11133 | 75 | Female | 61-80 |
11134 | 87 | Male | 81-100 |
11135 | 63 | Female | 61-80 |
11136 rows × 3 columns
# Visualize the age distribution of the population with a box plot, categorized by sex
df_box = df
fig = px.box(df_box, x="Sex", y="Age")

fig.show()
# From our artificial data here, I'll try adding a 'Country' attribute and making a more dynamic visualization
df
Age | Sex | Age_Interval | |
---|---|---|---|
0 | 8 | Female | 1-20 |
1 | 4 | Female | 1-20 |
2 | 9 | Male | 1-20 |
3 | 1 | Female | 1-20 |
4 | 6 | Male | 1-20 |
... | ... | ... | ... |
11131 | 67 | Male | 61-80 |
11132 | 73 | Female | 61-80 |
11133 | 75 | Female | 61-80 |
11134 | 87 | Male | 81-100 |
11135 | 63 | Female | 61-80 |
11136 rows × 3 columns
# Countries a person can be randomly assigned to.
country = ['Thailand', 'Taiwan', 'Japan', 'Germany']
df['Country'] = random.choice(country)  # create the column with some value first so it can be iterated
df['Country'] = [random.choice(country) for i in df['Country']]  # then draw a random country for every row
df
Age | Sex | Age_Interval | Country | |
---|---|---|---|---|
0 | 8 | Female | 1-20 | Germany |
1 | 4 | Female | 1-20 | Japan |
2 | 9 | Male | 1-20 | Japan |
3 | 1 | Female | 1-20 | Germany |
4 | 6 | Male | 1-20 | Japan |
... | ... | ... | ... | ... |
11131 | 67 | Male | 61-80 | Taiwan |
11132 | 73 | Female | 61-80 | Japan |
11133 | 75 | Female | 61-80 | Japan |
11134 | 87 | Male | 81-100 | Thailand |
11135 | 63 | Female | 61-80 | Thailand |
11136 rows × 4 columns
from dash import Dash, dcc, html, Input, Output
from jupyter_dash import JupyterDash
app = JupyterDash(__name__)

# Page layout: a title, a checklist for the x-axis grouping column(s),
# a radio selector for the y-axis column, and the graph itself.
app.layout = html.Div([
    html.H4("Analysis of Age Distribution in Population"),
    html.P("x-axis:"),
    dcc.Checklist(
        id='x-axis',
        options=['Country', 'Sex'],
        inline=True
    ),
    html.P("y-axis:"),
    dcc.RadioItems(
        id='y-axis',
        value='Age',
        inline=True
    ),
    dcc.Graph(id="graph"),
])


@app.callback(
    Output("graph", "figure"),
    Input("x-axis", "value"),
    Input("y-axis", "value"))
def generate_chart(x, y):
    """Build the box plot for the current control selections.

    Args:
        x: Current value of the 'x-axis' checklist.
            NOTE(review): presumably a list of selected column names, or
            None while nothing is ticked - confirm against the Dash docs.
        y: Current value of the 'y-axis' radio items ('Age' initially).

    Returns:
        The plotly express box-plot figure shown in the "graph" component.
    """
    # df is the module-level DataFrame; replace with your own data source
    return px.box(df, x=x, y=y)


if __name__ == '__main__':
    app.run_server(mode="inline")